Code to reproduce the figures and analysis in our paper
knitr::opts_chunk$set( fig.width = 12, fig.height=6 )
library(rstan)
library(rstanarm)
library(PRROC)
library(LaplacesDemon)
library(tidyverse)
library(bayesplot)
library(RColorBrewer)
library(gridExtra)
library(grid)
library(PRROC)
library(rlang)
myseed <- 3421237
setwd("/Users/annasmith/GitHub Repos/PredictionScoring/")
source("predScore4_alter_102919.R") ## prediction scoring functions
source("bayesplot_compare_012420.R") ## plots for model comparison
source("plotHelper.R") ## plotting functions for this dataset
Code available upon request. Uses the same functions contained in “predScore4_alter_102919.R”
The data is freely available at the following link: (https://github.com/gal-zz-lup/NGS2) [https://github.com/gal-zz-lup/NGS2].
## copy links for raw dataset files
gallup_coop <- url("https://raw.githubusercontent.com/gal-zz-lup/NGS2/master/cooperation_exp1_FINAL.csv")
gallup_rewire <- url("https://raw.githubusercontent.com/gal-zz-lup/NGS2/master/rewire_exp1.csv")
Preregistration data:
temp <- tempfile()
download.file("http://davidrand-cooperation.com/s/Rand2011PNAS_data_and_code-pi6b.zip",
temp, mode="wb")
gallup1 <- read.table(unz(temp,"Rand2011PNAS_cooperation_data.txt"),
sep="\t",skip=0,header=T)
unlink(temp)
Cycle 1 data:
gallup2 <- read.csv(gallup_coop,header = TRUE,sep = ',')
Perform some data cleaning tasks, including making variable names match and preparing round-level data.
## Make variable names match
names(gallup2)[names(gallup2) %in%
c("round","pid","action")] <- c("round_num",
"playerid","decision..0.D.1.C.")
gallup2$sessionF <- as.factor(gallup2$session)
gallup2$sessionnum <- unlist(lapply(1:nrow(gallup2), function(i){
which(gallup2$sessionF[i]==levels(gallup2$sessionF)) }))
## prep gallup datasets
gallup1rounds <- prepRoundData(gallup1)
gallup2rounds <- prepRoundData(gallup2)
## gallup2 has more rounds - add a dummy max round to gallup1
nrounds_gallup1 <- max(gallup1$round_num)
nrounds_gallup2 <- max(gallup2$round_num)
addon <- gallup1rounds$session_round_rate[1:(nrounds_gallup2-nrounds_gallup1),]
addon$round_num <- (nrounds_gallup1 + 1):nrounds_gallup2
addon$rate_contr <- NA
addon$num_player <- NA
gallup1rounds$session_round_rate <- rbind( gallup1rounds$session_round_rate,
addon )
g.pre <- plotContr(gallup1rounds,title="Preregistration Data")
g.cyc1 <- plotContr(gallup2rounds,title="Cycle 1 Data")
grid.arrange(g.pre,g.cyc1,nrow=2)
## Warning: Removed 4 rows containing non-finite values (stat_boxplot).
## Warning: Removed 4 row(s) containing missing values (geom_path).
hyp1.4 <- as.formula(decision..0.D.1.C. ~ fluid_dummy + round_num +
fluid_dummy*round_num)
## Model with preregistration data
model1.4 <- stan_glm( hyp1.4, data = gallup1,
family = binomial(link = "logit") )
model1.4_freq <- glm( hyp1.4, data=gallup1, family="binomial" )
## Model with Cycle 1 data
model2.4 <- stan_glm( hyp1.4, data = gallup2,
family = binomial(link = "logit") )
model2.4_freq <- glm( hyp1.4, data=gallup2, family="binomial" )
paramMat <- rbind( c(rbind( coef(model1.4),
round(summary(model1.4_freq)$coefficients[,'Pr(>|z|)'],4)),
nrow(gallup1)),
c(rbind( coef(model2.4),
round(summary(model2.4_freq)$coefficients[,'Pr(>|z|)'],4)),
nrow(gallup2)) )
rownames(paramMat) <- c("preregistration","cycle 1")
colnames(paramMat) <- c(rbind( names(coef(model1.4)),
rep("pval",length(coef(model1.4))) ), "n")
paramMat
## (Intercept) pval fluid_dummy pval round_num pval
## preregistration 0.6996307 0 -0.1585307 0.2200 -0.1706102 0
## cycle 1 2.0077361 0 0.7138259 0.0401 -0.2331044 0
## fluid_dummy:round_num pval n
## preregistration 0.1357853 0e+00 3876
## cycle 1 0.2228711 1e-04 1192
g.comp_pars <- mcmc_intervals_compare(model1.4,model2.4,
pars="fluid_dummy:round_num",
model1.name="Pilot Data",
model2.name="Experimental Data",
colorlist = brewer.pal(7,"BrBG")[c(2,6,3,5)]) +
scale_y_discrete(labels=c("",bquote(beta[4]))) +
theme(legend.title = element_text(size=9),
legend.text = element_text(size=9))
g.comp_full <- mcmc_intervals_compare(model1.4,model2.4,
model1.name="Pilot Data",
model2.name="Experimental Data",
colorlist = brewer.pal(7,"BrBG")[c(2,6,3,5)]) +
theme(legend.title = element_text(size=9),
legend.text = element_text(size=9))
grid.arrange(g.comp_pars,g.comp_full,ncol=2)
trad_preds <- getCurves( model1.4, gallup2 )
wtd_preds <- getCurves( model1.4, gallup2, old_data=gallup1, wtd=TRUE )
rocdf <- rbind( trad_preds$rocdf, wtd_preds$rocdf )
prdf <- rbind( trad_preds$prdf, wtd_preds$prdf )
plot_pred_roc <- ggplot(rocdf,aes(V1,V2,col=wtd,pch=wtd)) +
geom_point() + geom_line() +
scale_color_manual(values=c("goldenrod1","orangered")) +
geom_abline(slope=1,color="grey",lty=2) +
theme_classic() + theme(text=element_text(family="Times"),
panel.grid.major = element_line("gray90"),
panel.grid.minor = element_line("gray95"),
legend.position = "none") +
labs(title="ROC", x="False positive rate", y="True positive rate")
plot_pred_pr <- ggplot(prdf,aes(V1,V2,col=wtd,pch=wtd)) +
geom_point() + geom_line() +
scale_color_manual(values=c("goldenrod1","orangered")) +
ylim(c(0,1)) + labs(title="Precision-Recall", x="Recall", y="Precision") +
theme_classic() + theme(text=element_text(family="Times"),
panel.grid.major = element_line("gray90"),
panel.grid.minor = element_line("gray95"),
legend.position = "none")
grid.arrange(plot_pred_roc,plot_pred_pr,ncol=2)
g.pre.model <- plotModel(gallup1rounds$session_round_rate,
coef(model1.4_freq), title="Pilot data")
g.cyc1.model <- plotModel(gallup2rounds$session_round_rate,
coef(model2.4_freq), title="Experimental data")
grid.arrange(g.pre.model,g.cyc1.model,ncol=2)
## Warning: Removed 4 rows containing non-finite values (stat_boxplot).
Each of the following versions of our prediction scoring method takes ~ 10 minutes to run. ### Unweighted version
set.seed(myseed)
val_1.4 <- predscore( model1.4, newdata=gallup2, Ksize=50 )
plots <- with(val_1.4, plot.predscore(cv,val,isBin=TRUE,plot=FALSE))
ps_pr <- plots[[4]] +
theme_classic() + theme(text=element_text(family="Times"),
panel.grid.major = element_line("gray90"),
panel.grid.minor = element_line("gray95"),
legend.position = "none") +
scale_fill_manual(values=c(brewer.pal(7,"BrBG")[6],"goldenrod1")) +
scale_color_manual(values=c(brewer.pal(7,"BrBG")[6],"goldenrod1")) +
ggtitle("Precision-Recall")
ps_roc <- plots[[2]] +
theme_classic() + theme(text=element_text(family="Times"),
panel.grid.major = element_line("gray90"),
panel.grid.minor = element_line("gray95"),
legend.position = "none") +
scale_fill_manual(values=c(brewer.pal(7,"BrBG")[6],"goldenrod1")) +
scale_color_manual(values=c(brewer.pal(7,"BrBG")[6],"goldenrod1")) +
ggtitle("ROC") + geom_abline(slope=1,color="grey50",lty=2)
ps_pr_auc <- plots[[8]] +
theme_classic() + theme(text=element_text(family="Times"),
panel.grid.major = element_line("gray90"),
panel.grid.minor = element_line("gray95"),
legend.position = "none") +
scale_fill_manual(values=c(brewer.pal(7,"BrBG")[6],"goldenrod1")) +
scale_color_manual(values=c(brewer.pal(7,"BrBG")[6],"goldenrod1")) +
ggtitle("Precision-Recall") + labs(x="AUC Statistics")
ps_roc_auc <- plots[[7]] +
theme_classic() + theme(text=element_text(family="Times"),
panel.grid.major = element_line("gray90"),
panel.grid.minor = element_line("gray95"),
legend.position = "none") +
scale_fill_manual(values=c(brewer.pal(7,"BrBG")[6],"goldenrod1")) +
scale_color_manual(values=c(brewer.pal(7,"BrBG")[6],"goldenrod1")) +
ggtitle("ROC") + labs(x="AUC Statistics")
grid.arrange(ps_pr,ps_pr_auc,
ps_roc,ps_roc_auc)
## Warning: Removed 3 rows containing missing values (geom_point).
ks.test( val_1.4$cv$results$q, val_1.4$val$results$q )
##
## Exact two-sample Kolmogorov-Smirnov test
##
## data: val_1.4$cv$results$q and val_1.4$val$results$q
## D = 0.93506, p-value = 8.993e-14
## alternative hypothesis: two-sided
Observations are resampled to match the marginal distributions.
set.seed(myseed)
val_wtd_1.4 <- predscore( model1.4, newdata=gallup2, Ksize=50,
wtd=TRUE )
plots_wtd <- with(val_wtd_1.4, plot.predscore(cv,val,isBin=TRUE,plot=FALSE))
ps_wtd_pr <- plots_wtd[[4]] +
theme_classic() + theme(text=element_text(family="Times"),
panel.grid.major = element_line("gray90"),
panel.grid.minor = element_line("gray95"),
legend.position = "none") +
ggtitle("Precision-Recall")
ps_wtd_roc <- plots_wtd[[2]] +
theme_classic() + theme(text=element_text(family="Times"),
panel.grid.major = element_line("gray90"),
panel.grid.minor = element_line("gray95"),
legend.position = "none") +
ggtitle("ROC") + geom_abline(slope=1,color="grey50",lty=2)
ps_wtd_pr_auc <- plots_wtd[[8]] +
theme_classic() + theme(text=element_text(family="Times"),
panel.grid.major = element_line("gray90"),
panel.grid.minor = element_line("gray95"),
legend.position = "none") +
ggtitle("Precision-Recall") + labs(x="AUC Statistics")
ps_wtd_roc_auc <- plots_wtd[[7]] +
theme_classic() + theme(text=element_text(family="Times"),
panel.grid.major = element_line("gray90"),
panel.grid.minor = element_line("gray95"),
legend.position = "none") +
ggtitle("ROC") + labs(x="AUC Statistics")
grid.arrange(ps_wtd_pr,ps_wtd_pr_auc,
ps_wtd_roc,ps_wtd_roc_auc)
## Warning: Removed 1 rows containing missing values (geom_point).
ks.test( val_wtd_1.4$cv$results$q, val_wtd_1.4$val$results$q )
##
## Exact two-sample Kolmogorov-Smirnov test
##
## data: val_wtd_1.4$cv$results$q and val_wtd_1.4$val$results$q
## D = 0.2987, p-value = 0.001957
## alternative hypothesis: two-sided